* Descriptives


// Root folder of the PATSTAT tables; referenced below as $dta/tls801/tls801
global dta "~/Dropbox/PATSTAT_spring2015"
// Scratch folder for the intermediate datasets this script writes and re-reads
global tmp "/tmp"
// Initial working directory; later sections issue their own relative cd calls
cd ~/Dropbox/PATSTAT/DATA/
// Suffix inserted into exported graph file names via ${wto}
global wto "_WTO"


* ---------------------------------------
* Descriptives
* ---------------------------------------


* -----------------------------------------------------------------------------
* Preliminary
* -----------------------------------------------------------------------------
// WTO membership list: ISO3 code and entry year per member, with the ISO3
// column renamed to the key name (iso_alpha3) used by the country table below
import delimited Ctry_heterog_data/wto_members.csv, varnames(1) encoding(utf8) clear
rename iso3 iso_alpha3
keep iso_alpha3 entry_year
save $tmp/wtomembers, replace 

// Country-code table: trim the ISO3 code, then drop rows without an ISO3
// code and the duplicate DDR code
use $dta/tls801/tls801, clear
replace iso_alpha3 = strtrim(iso_alpha3)
/* DDR has 2 country codes in Patstat: DD, DL. Only DD is in our dataset. */
drop if iso_alpha3 == "" | ctry_code == "DL"
save $tmp/tmp801, replace

// Identify all countries in Patstat
// Builds one row per headquarters country code found in patentsfam4, attaches
// the ISO3 code from the country table and saves the list for the WTO merge.
use patentsfam4, clear
contract headq
drop _freq
rename headq ctry_code
merge 1:1 ctry_code using $tmp/tmp801, keep(match master)
// Replace Iso3 code of Taiwan and Romania to merge with WTO code
replace iso_alpha3 = "CHT" if iso_alpha3 == "TWN" & ctry_code == "TW"
replace iso_alpha3 = "ROM" if iso_alpha3 == "ROU" & ctry_code == "RO"
// Make fake iso code for headq without state indicator
replace iso_alpha3 = "X" + ctry_code if iso_alpha3 == "" & _merge == 1
// Codes absent from the country table are flagged "N" and removed together
// with every other row flagged "N"
// NOTE(review): this also removes the rows that just received a fake iso
// code above - confirm that is intended
replace state_indicator = "N" if _merge == 1
drop if state_indicator == "N"
drop _merge
rename ctry_code headq
drop continent eu_member epo_member oecd_member discontinued state_indicator
// Persist the country list: the next step starts by loading
// $tmp/all_countries, which nothing else in this script creates
save $tmp/all_countries, replace

// Identify WTO countries in Patstat
// wto     = 1 when the country appears in the WTO membership list
// wto1995 = 1 when it appears there with entry year 1995
use $tmp/all_countries, clear
merge 1:1 iso_alpha3 using $tmp/wtomembers, keep(match master)
gen wto = (_merge == 3)
gen wto1995 = (wto == 1 & entry_year == 1995)
drop _merge
rename entry_year wto_entry_year
tab wto wto1995
save WTOmembers_countries, replace


* -----------------------------------------------------------------------------
* Table 1 and Appendix Table 12
* Estimation sample relative to raw and initial sample
* -----------------------------------------------------------------------------
log using ../logs/Table1_app_table12, replace
di "********************* Table 1 and Appendix Table 12  **********************"

// Raw sample = patentees satisfying criteria 1-3 below
use patentsfam4, clear
// 1. headquarters country is present in the country list
merge m:1 headq using WTOmembers_countries, keep(match) keepusing(headq wto1995) nogenerate
// 2. industry is known
keep if !missing(nace2_1)
// 3. at least one granted patent filed in 1992-2000
keep if granted > 0 & inrange(y, 1992, 2000)
// One row per patentee / industry / WTO-1995 status
contract hrm_l2_id nace2_1 wto1995
drop _freq
merge m:1 hrm_l2_id using hrm_sector, keep(match master) nogenerate
count // Number of firms
save $tmp/raw_sample, replace

// Initial sample = raw sample plus criterion 4
// (the raw sample just saved is still in memory, so it is not reloaded)
// 4. country was a WTO member by 1995
drop if wto1995 != 1
count // Number of firms
save $tmp/initial_sample, replace

// Save final sample
// Rows of the regression dataset for WTO-1995 countries with non-missing
// tariff flag (MissTar == 0), homew < 1 and P > 0 in 1992, reduced to one row
// per patentee / sector flags / industry
use reg_variables6Granted_WTO, clear
keep if wto1995 == 1 & MissTar == 0 & homew < 1 & P > 0 & y == 1992
contract hrm firm individual other_sector unknown_sector nace2_1
drop _freq
count // Number of firms
save $tmp/final_sample, replace 
* patentsfam3 includes all firms with pre-sample weights
* patentsfam4 includes all firms

// Combine all samples
// Flags every row of patentsfam4 (from 1992 on) with membership in the raw,
// initial and final samples, keeps rows belonging to at least one of them and
// makes the granted-patent count the working patent variable p
use patentsfam4, clear
drop if y <= 1991
merge m:1 hrm_l2_id using $tmp/raw_sample, keep(match master)
gen raw_sample = (_merge == 3)
gen initial_sample = (raw_sample == 1 & wto1995 == 1)
drop _merge
// update replace: values from the final-sample file take precedence
merge m:1 hrm_l2_id using $tmp/final_sample, update replace 
// _merge codes 3, 4 and 5 all denote a matched observation under update
gen final_sample = inlist(_merge, 3, 4, 5)
drop _merge
drop if raw_sample != 1 & initial_sample != 1 & final_sample != 1
drop p
rename granted p
save $tmp/samples, replace


// Overall Delta_K for raw, initial and final sample and # firms patenting
// The dataset is loaded once before each loop so that the *_sample wildcard
// can be expanded; each iteration then reloads it because collapse destroys it
use $tmp/samples, clear
foreach smp of varlist *_sample {
	di "`smp'"
	use $tmp/samples, clear
	keep if inrange(y, 1992, 2000) & `smp' == 1 & p > 0
	// tag marks one row per patentee, so its sum is the number of patentees
	egen tag = tag(hrm_l2_id)
	collapse (sum) p firms = tag
	tabstat p firms
}

// Mean, median and sd of Delta_K etc. across firms, per sample
/* Note: Delta_K can be 0 in final sample */
use $tmp/samples, clear
foreach smp of varlist *_sample {
	di "`smp'"
	use $tmp/samples, clear
	keep if `smp' == 1 & inrange(y, 1992, 2000)
	gen pcit = p*citations3year
	// Firm-level totals over the sample period ...
	collapse (sum) px = p ptri = triadic pcit, by(hrm_l2_id)
	// ... then their distribution across firms
	collapse (mean) px ptri pcit ///
		(median) pmed=px ptrimed=ptri pcitmed=pcit ///
		(sd) psd=px ptrisd=ptri pcitsd=pcit
	list
}

// Number of firms in raw, initial and final sample
use $tmp/samples, clear
foreach smp of varlist *_sample {
	di "`smp'"
	use $tmp/samples, clear
	keep if `smp' == 1
	// One row per patentee, then count the rows
	contract hrm_l2_id
	collapse (count) firms = hrm_l2_id
	list
}

log close


* -----------------------------------------------------------------------------
* Section 4 (Data) and Appendix Table 8 
* Patentee type in initial and final sample
/* Firm, individual, other sector (e.g. university, government) or unknown */
* -----------------------------------------------------------------------------
cd ../data
log using ../logs/Patentee_type, replace
di "Patentee type, mean and median Delta_K by type  - Initial and final sample"

// Single sector label per row; when several flags are set the precedence is
// unknown > other sector > individual > firm
use $tmp/samples, clear
gen sector = cond(unknown_sector == 1, "Unknown", ///
	cond(other_sector == 1, "Other sector", ///
	cond(individual == 1, "Individual", ///
	cond(firm == 1, "Firm", ""))))
save $tmp/tmpsector, replace

// Share of firms by sector in raw, initial and final sample
use $tmp/tmpsector, clear
foreach smp of varlist *sample {
	di "`smp'"
	use $tmp/tmpsector, clear
	keep if `smp' == 1
	// One row per patentee-sector pair before tabulating
	contract hrm_l2_id sector
	tab sector
}

// Mean and median Delta_K across firms, by sector, per sample
/* Note: Delta_K can be 0 in final sample */
use $tmp/tmpsector, clear
foreach smp of varlist *sample {
	di "`smp'"
	use $tmp/tmpsector, clear
	keep if `smp' == 1 & inrange(y, 1992, 2000)
	// Firm totals first, then their mean and median within each sector
	collapse (sum) p, by(sector hrm_l2_id)
	collapse (mean) p (median) pmed = p, by(sector)
	list
}

log close


* -----------------------------------------------------------------------------
* Figure 1: Share of Patenting Firms by Country
* Number of firms and patents in final sample, by headquarters country
* -----------------------------------------------------------------------------
cd ../data
global iff "MissTar==0 & homew<1"

// Estimation rows: WTO-1995 countries, sample filter, non-missing L8lnP
use reg_variables6Granted_WTO, clear
keep if wto1995==1 & $iff & L8lnP != .
count if y==2000  // Number of firms

// Firms per headquarters country in 2000 and each country's share
collapse (count) hrm if y==2000, by(headq)
count  // Number of countries
egen tot = total(hrm)
gen sh = hrm/tot 
gsort -sh
// Flag the ten countries with the largest shares
gen top10 = 1 in 1/10
list if top10==1
graph bar (asis) sh if top10==1, ///
	over(headq, sort(sh) label(labsize(12pt))) ///
	scheme(lean1) ylabel(, labsize(12pt))
graph export "../graph/num_firms_hq_granted${wto}.eps", as(eps) preview(on) replace
graph save "../graph/num_firms_hq_granted${wto}.gph", replace


* -----------------------------------------------------------------------------
* Figure 1: Share of Patenting Firms by Industry
* Which sectors in the final sample
* -----------------------------------------------------------------------------
global iff "MissTar==0 & homew<1"
use reg_variables6Granted_WTO, clear
keep if wto1995==1 & $iff & L8lnP != .
// Integer part of the industry code
gen nace=int(nace2_1)
// Firms per industry in 2000 and each industry's share
collapse (count) hrm if y==2000, by(nace)
egen tot = total(hrm)
gen sh = hrm/tot 
gsort -sh
// Flag the ten industries with the largest shares
gen top10 = 1 in 1/10
graph bar (asis) sh if top10==1, ///
	over(nace, sort(sh) label(labsize(12pt))) ///
	scheme(lean1) ylabel(, labsize(12pt))
graph export "../graph/num_firms_nace_granted${wto}.eps", as(eps) preview(on) replace
graph save "../graph/num_firms_nace_granted${wto}.gph", replace


* -----------------------------------------------------------------------------
* Figure 2
* Change in tauT for different countries 
* -----------------------------------------------------------------------------
// Sample filter, defined before its first use so that this section does not
// depend on an earlier section having set it
global iff "MissTar==0 & homew<1"
global year "y < 2005"

* Get tau for all years
// Divide every tariff measure by 100
use "$tmp/tmpYY", clear
foreach vv of varlist taut tautApp tauthq tautPred tautAppPred tautImp tautAppImp {
  replace `vv' = `vv'/100
}
save "$tmp/tmpYY_all", replace

// One row per firm / country / industry that appears in the estimation rows
use reg_variables6Granted_WTO, clear
keep if wto1995 == 1
keep if $iff & L8lnP != .
collapse y, by(hrm headq nace2_1)
drop y
save $tmp/allfirms, replace

// Restrict the tariff panel to those firms and average by country and year
use "$tmp/tmpYY_all", clear
merge m:1 hrm headq nace2_1 using $tmp/allfirms, keep(match)

collapse (mean) taut* (count) hrm if $iff, by(headq y)
// One line per country; the stray empty plot separator between the JP and GB
// lines has been removed
twoway line taut y if headq=="US" & $year || line  taut y if headq=="DE" & $year ///
	|| line taut y if headq=="JP" & $year || line taut y if headq=="GB" & $year, ///
	scheme(lean1) legend(order(1 "US" 2 "DE" 3 "JP" 4 "GB") ring(0) col(1) pos(1) size(12pt)) ///
	ytitle("T_bar (mean)", size(12pt)) xtitle("Year", size(12pt)) xscale(range(1990 2005)) ///
	xlabel(,labsize(12pt)) ylabel(,labsize(12pt))
graph export "../graph/tauC_cty_granted${wto}.eps", as(eps) preview(on) replace
graph save "../graph/tauC_cty_granted${wto}.gph", replace


* -----------------------------------------------------------------------------
* Figure 3
* Density of tauT 
* -----------------------------------------------------------------------------
// Density of tauT (using MFN tariff)
// Firm-level tariff in 1992 and 2000 for the firms listed in $tmp/allfirms,
// reshaped to one row per firm so both years can be overlaid
use reg_variables6Granted_WTO, clear
keep if wto1995 == 1
merge m:1 hrm headq nace2_1 using $tmp/allfirms, keep(match)
keep if inlist(y, 1992, 2000) & MissTar==0 & homew<1
keep hrm headq y taut
reshape wide taut, i(hrm headq) j(y)
// Both histograms are restricted to tariffs below .20
twoway ///
	(hist taut1992 if taut1992<.20, scheme(lean1) bin(30) color(gs13) frac) ///
	(hist taut2000 if taut2000<.20, bin(30) fcolor(none) lcolor(black) frac), ///
	xtitle("(Weighted) average import tariff", size(12pt)) ytitle(, size(12pt)) ///
	legend(order(1 "1992" 2 "2000") ring(0) col(1) pos(1) size(12pt)) ///
	xlabel(,labsize(12pt)) ylabel(,labsize(12pt))
graph export "../graph/import_tariffs_firmlevel_granted${wto}.eps", as(eps) preview(on) replace
graph save "../graph/import_tariffs_firmlevel_granted${wto}", replace

// How many countries, industries and country-industries in final sample?
// Same selection as the final sample: WTO-1995, sample filter, P > 0 in 1992
use reg_variables6Granted_WTO, clear
keep if wto1995 == 1 & MissTar==0 & homew<1 & P>0 & y==1992
contract hrm headq nace2_1
drop _freq
save $tmp/hrm_headq_ind, replace
count // 41058 firms

// Distinct industries, countries and country-industry pairs, in that order
// (recorded results: 54 industries, 65 countries, 1315 country-industry pairs)
foreach grp in "nace2_1" "headq" "headq nace2_1" {
	use $tmp/hrm_headq_ind, clear
	contract `grp'
	count
}


* -----------------------------------------------------------------------------
* Table 2
* Mean, median, sd, of knowledge stock, patents and tariffs over time 
* (1992, 2000, 2004)
* -----------------------------------------------------------------------------
log using ../logs/Table_2, replace
di "******************************** Table 2 *********************************"
// Yearly summary statistics for the firms listed in $tmp/allfirms
// NOTE(review): every year in the data is listed, not only the three years
// named in the header - confirm that is intended
use reg_variables6Granted_WTO, clear
merge m:1 hrm headq nace2_1 using $tmp/allfirms, keep(match)
collapse (mean) lnP taut tautApp ///
	(median) P mdlnP = lnP mdtaut = taut mdtautApp = tautApp ///
	(sd) sdlnP = lnP sdtaut = taut sdtautApp = tautApp ///
	(count) nb_firms = hrm, by(y)
list

log close


* -----------------------------------------------------------------------------
* Appendix Figure 5
* Missing country info
* -----------------------------------------------------------------------------
cd ../data
use pat_hldr_allperiod_2, clear
// One row per patentee / year / country / industry combination
collapse (count) appln_id, by(hrm y headq nace)
// Flags for an empty country code and a missing industry code
gen missh = missing(headq)
gen missn = (nace == .)
// Per year: number of rows and number of rows with each kind of gap
collapse (count) hrm (sum) missh missn, by(y)

// Shares in percent
gen shh = 100 * missh / hrm
gen shn = 100 * missn / hrm
label var shh "Missing country information"
label var shn "Missing industry information"
label var y "Year"
drop if y > 2004
line shh shn y, scheme(lean1) legend(ring(0) pos(2) size(12pt)) ///
	ytitle("Share of patentees, %", size(12pt)) xtitle(, size(12pt)) ///
	xlabel(,labsize(12pt)) ylabel(,labsize(12pt))
graph save ../graph/missing, replace
graph export ../graph/missing.eps, replace


* -----------------------------------------------------------------------------
* Appendix Figure 6
* Average industry-level tariffs
* -----------------------------------------------------------------------------
// Load the tariff panel and restrict it to 1992-2009 with non-missing tariffs
use TRAINS/allMFN_NACEr2_interpl, clear
drop isic4code
drop if year<1992 | year>2009
drop if tariff==.
// nyear is presumably the number of remaining rows per country-industry cell
// NOTE(review): [_N] is an unusual spelling of _N - confirm it evaluates the same
bys iso2 nace2code: gen nyear=[_N]
// 1992-2009 spans 18 years, so this keeps cells with at least 18 rows
keep if nyear>=18
save $tmp/ttmp, replace

* Aggregate to 3 digit
// Truncate the industry code to one decimal place, then average within it
gen nace2_1 = int(nace2code*10)/10
collapse (mean) tariff*, by(iso2 nace2_1 year)
// One row per country-industry with one tariff column per year
reshape wide tariff, i( iso2 nace2_1 ) j( year )
ren iso2 appln_auth
// nace is an abbreviation; presumably it resolves to nace2_1 here - TODO confirm
tostring nace, gen(tmp) format(%7.2f) force  // Drop 2 digit
// Rows whose formatted code has "00" at character positions 4-5 are removed
drop if substr(tmp,4,2)=="00"
drop tmp
save $tmp/tariff3dig, replace

// Average tariff per destination, then per income group and year
use $tmp/tariff3dig, clear
collapse (mean) tariff*, by(appln)

* Dummy for whether the firm is selling to only high-income countries (World Bank 1995 definition)
// The first continuation line used to end with "|" while the next one also
// started with "|", producing a doubled operator; the trailing one is removed
// NOTE(review): "UA" is in the high-income list - confirm this code is intended
gen HI = (appln_auth=="AD" | appln_auth=="AE" ///
	| appln_auth=="AT" | appln_auth=="AU" | appln_auth=="BE" | appln_auth=="BS" | appln_auth=="CA" | appln_auth=="CH" ///
	| appln_auth=="CY" | appln_auth=="DE" | appln_auth=="DK" | appln_auth=="ES" | appln_auth=="FI" ///
	| appln_auth=="FR" | appln_auth=="GB" | appln_auth=="HK" | appln_auth=="IE" | appln_auth=="IL" ///
	| appln_auth=="IS" | appln_auth=="IT" | appln_auth=="JP" | appln_auth=="KR" | appln_auth=="KW" ///
	| appln_auth=="KY" | appln_auth=="LI" | appln_auth=="LU" ///
	| appln_auth=="MC" | appln_auth=="NL" | appln_auth=="NO" | appln_auth=="NZ" | appln_auth=="PT" ///
	| appln_auth=="SE" | appln_auth=="SG" | appln_auth=="TW" | appln_auth=="UA" | appln_auth=="US") 

// Mean tariff per income group, back in long form for plotting over time
collapse (mean) tariff*, by(HI)
reshape long tariff, i(HI) j(year)
label var tariff "Ad-valorem tariff (mean)"
// High-income line on the left axis, the remaining countries on the right axis
line tariff year if HI==1, yaxis(1) || line tariff year if HI==0, yaxis(2) scheme(lean1) ///
	legend(ring(0) pos(1) order(1 "High income (left)" 2 "Low income (right)") size(12pt)) ///
	xtitle(, size(12pt)) ytitle(, size(12pt) axis(1)) ytitle(, size(12pt) axis(2)) ///
	xlabel(,labsize(12pt)) ylabel(, labsize(12pt) axis(1)) ylabel(, labsize(12pt) axis(2))
graph export "../graph/mean_tariffs.eps", as(eps) preview(on) replace
graph save "../graph/mean_tariffs.gph", replace


* -----------------------------------------------------------------------------
* Appendix Figure 9 and Table 10
* Check how much the weights w change over time
* -----------------------------------------------------------------------------
log using ../logs/Appendix_Fig9_Table10, replace
di "******************** Appendix Figure 9 and Table 10 **********************"

// List of patentees in the final sample
use finalsample, clear
contract hrm
save $tmp/tmpfinal, replace

// Put the three weight vintages side by side: each merge brings in a new
// variable w, which is kept and then renamed after its vintage
use weightswic5, clear
keep hrm appln headq w
rename w w1985
merge 1:1 appln hrm using weightswic5_85to95
keep hrm appln headq w1985 w
rename w w1995
merge 1:1 appln hrm using weightswic5_95to05
keep hrm appln headq w1985 w1995 w
rename w w2005
// Restrict to final-sample patentees
merge m:1 hrm using $tmp/tmpfinal, keep(match)
drop _merge _freq

// A weight that is absent in a vintage counts as zero
foreach yr in 1985 1995 2005 {
	replace w`yr' = 0 if w`yr' == .
	label var w`yr' "Weight, `yr'"
}

// D<year> = 1 when the weight is positive in that vintage
foreach yr in 1985 1995 2005 {
	gen D`yr' = (w`yr' > 0)
}
// Number of positive-weight rows per patentee in 1985
egen mkts85 = total(D1985), by(hrm)
// Positive in 1985 and still positive in 1995 / 2005
gen same1 = (D1985==1 & D1995==1)
gen same2 = (D1985==1 & D2005==1)
save $tmp/tmp, replace

* Intensive margin
// Local polynomial fit of later weights on 1985 weights, with 95% bands

use $tmp/tmp, clear 
lpoly w1995 w1985 if same1, noscatter ci kernel(epan2) nograph gen(xfit1 y1) se(se1)
lpoly w2005 w1985 if same2 , noscatter ci kernel(epan2) nograph gen(xfit2 y2) se(se2)
capture drop lb? ub?
// Lower and upper band = fit -/+ z(.975) * standard error, for both fits
forvalues k = 1/2 {
	gen lb`k' = y`k' - invnormal(.975)*se`k'
	gen ub`k' = y`k' + invnormal(.975)*se`k'
}
sort xfit1
twoway rarea lb1 ub1 xfit1 || rarea lb2 ub2 xfit2 || ///
	line y1 xfit1 || line y2 xfit2 , ///
	scheme(lean1) xtitle("1985 weights", size(12pt)) ///
	ytitle("Future weights", size(12pt)) ///
	legend(on order(3 4) pos(5) ring(0) label(1 "") label(3 "1995") label(4 "2005") size(12pt)) ///
	xlabel(,labsize(12pt)) ylabel(,labsize(12pt))
graph export "../graph/persistence_weights.eps", as(eps) preview(on) replace
graph save "../graph/persistence_weights.gph", replace

// Linear fit and pairwise correlation of later weights with 1985 weights
reg w1995 w1985 if same1
reg w2005 w1985 if same2
pwcorr w1995 w1985 if same1, sig
pwcorr w2005 w1985 if same2, sig

// Every reload below reads $tmp/tmp, the file saved by the weights step,
// instead of a hard-coded /tmp/tmp, so changing the tmp global keeps the
// script consistent

* Aggregate exit rate
// Among rows with a positive 1985 weight: share still positive later
use $tmp/tmp, clear
collapse (mean) D1985 D1995 D2005 (semean) se85=D1985 se95=D1995 se05=D2005 if D1985==1
list

* Aggregate entry rate
// Among rows with a positive 2005 weight: share already positive earlier
use $tmp/tmp, clear
collapse (mean) D1985 D1995 D2005 (semean) se85=D1985 se95=D1995 se05=D2005 if D2005==1
list

* What is the overall probability of patenting in a market (1985) ? 
// For each vintage: distinct patentees, distinct destinations and number of
// patentee-destination rows with a positive weight
use $tmp/tmp, clear
collapse (count) D???? if D1985==1, by(hrm)
count // 72188 firms
use $tmp/tmp, clear
collapse (count) D???? if D1985==1, by(appln)
count // 51 destinations
use $tmp/tmp, clear
count if D1985==1 // 134547 firm-destinations
* Unconditional probability is 3.65%

use $tmp/tmp, clear
collapse (count) D???? if D1995==1, by(hrm)
count // 41280 firms
use $tmp/tmp, clear
collapse (count) D???? if D1995==1, by(appln)
count // 80 destinations
use $tmp/tmp, clear
count if D1995==1 // 108920 firm-destinations
* Unconditional probability is 3.30%

use $tmp/tmp, clear
collapse (count) D???? if D2005==1, by(hrm)
count // 51710 firms
use $tmp/tmp, clear
collapse (count) D???? if D2005==1, by(appln)
count // 89 destinations
use $tmp/tmp, clear
count if D2005==1 // 159256 firm-destinations
* Unconditional probability is 3.46%

log close


*------------------------------------------------------------------------------
* Section K, Appendix
*------------------------------------------------------------------------------
cd ../data
log using ../logs/Appendix_Section_K, replace
di "******************* Section K, Appendix, desctiptives ********************"

// Find top 10 countries by GDP in 2000 (Source, IMF WEO)
import excel Ctry_heterog_data/imf_GDP_20191209.xls, sheet("NGDPD") firstrow clear
rename (GDPcurrentpricesBillionsof N V) (country gdp1992 gdp2000)
keep country gdp1992 gdp2000
// Cells containing the text "no data" become missing
destring gdp1992 gdp2000, replace ignore("no data")
gsort -gdp2000
list

// Get all patents for each firm and year (from firm_patents2.do)
use patents_docdb, clear
drop mkts inEP inUS inJP
label var granted "Granted patent"
label var citations "Number of citations for the patent"
label var citations3year "Number of citations after 3 years for the patent"
label var num_inventors "Number of inventors for the patent"
label var num_ipc "Number of IPC codes for patent"
label var num_cpc "Number of CPC codes for patent"
save $tmp/patents_docdb, replace

// Save a temporary final sample with patents
// (the patent data just saved is still in memory, so it is not reloaded)
// assert() stops the script if the sample file holds a patentee without patents
merge m:1 hrm_l2_id using $tmp/final_sample, assert(match master)
drop if _merge != 3
drop _merge
save $tmp/finsample, replace

// Save a temporary initial sample with patents
use $tmp/patents_docdb, clear
merge m:1 hrm_l2_id using $tmp/initial_sample, assert(match master)
drop if _merge != 3
drop _merge
save $tmp/initsample, replace


*------------------------------------------------------------------------------
* Appendix Table 8 - Final sample
* Patents quality descriptives
*------------------------------------------------------------------------------
// TOTALS: Patents, granted patents, citations, inventors and patent breadth
// If a patent is co-owned, we count it only once
use $tmp/finsample, clear
// One row per patent family: granted if any row is granted, earliest year
collapse (max) granted (mean) citations3year num_inventors num_ipc num_cpc ///
	(min) y, by(docdb_family_id)
keep if inrange(y, 1992, 2000) // sample period
preserve
collapse (count) totp = docdb_family_id (sum) totg = granted
gen sh_granted = totg/totp // Share of granted patents
tabstat totp totg sh_granted
restore
// Quality descriptives based on granted patents only
keep if granted == 1
collapse (sum) totg = granted (mean) citations3year num_inventors num_ipc num_cpc
tabstat totg citations3year num_inventors num_ipc num_cpc

// BY COUNTRY: Patents, granted patents, citations, inventors and patent breadth
// If a patent is co-owned by firms in different countries, we count it once
// for each country
use $tmp/finsample, clear
// 10 major economies in 2000 (nominal gdp); two inlist() calls because the
// string form of inlist() accepts at most ten arguments
gen top10 = (inlist(headq, "US", "JP", "DE", "GB", "FR") ///
	| inlist(headq, "IT", "CA", "MX", "BR", "ES"))
collapse (max) granted (mean) citations3year num_inventors num_ipc num_cpc ///
	(min) y, by(docdb_family_id headq top10)
keep if inrange(y, 1992, 2000) // sample period
preserve
collapse (count) totp=docdb_family_id (sum) totg=granted, by(headq top10)
gen sh_granted = totg/totp // Share of granted patents
tabstat totp totg sh_granted if top10 == 1, by(headq) nototal
restore
// Quality descriptives based on granted patents only
keep if granted == 1
collapse (sum) totg = granted ///
	(mean) citations3year num_inventors num_ipc num_cpc, by(headq top10)
tabstat totg citations3year num_inventors num_ipc num_cpc if top10 == 1, by(headq) nototal


* -----------------------------------------------------------------------------
* Appendix Table 9 - Final sample
* Patenting by industry (and country)
* -----------------------------------------------------------------------------
// Total number of patents in the world 1992-2000 by industry
// Drop duplicates in nace (2 digit)
// Note: a patent is only counted once in each nace (2 digit)
use $tmp/finsample, clear
gen nace = int(nace2_1) // 2-digit nace
collapse (max) granted (min) y, by(docdb_family_id nace)
keep if inrange(y, 1992, 2000) // sample period
collapse (count) p = docdb_family_id (sum) granted, by(nace)
tabstat p granted, by(nace) nototal
egen totalg = total(granted)
summarize totalg


// Total number of patents in the world 1992-2000 by industry and country (top 10)
// Drop duplicates in nace (2 digit) and country
// Note: a patent is only counted once in each nace (2 digit) and country
use $tmp/finsample, clear
// Keep only the 10 major economies in 2000 (nominal gdp); the filter depends
// on headq alone, so applying it before the collapses yields the same rows
keep if inlist(headq, "US", "JP", "DE", "GB", "FR") ///
	| inlist(headq, "IT", "CA", "MX", "BR", "ES")
gen nace = int(nace2_1) // 2-digit nace
collapse (max) granted (min) y, by(docdb_family_id nace headq)
keep if inrange(y, 1992, 2000) // sample period
collapse (count) p = docdb_family_id (sum) g = granted, by(headq nace)
// One row per industry, one p/g column pair per country
reshape wide p g, i(nace) j(headq) string
order nace pUS pJP pDE pGB pFR pIT pCA pMX pBR pES
tabstat p??, by(nace) nototal
order nace gUS gJP gDE gGB gFR gIT gCA gMX gBR gES
tabstat g??, by(nace) nototal


* -----------------------------------------------------------------------------
* Appendix Table 10 - Final sample
* Number of firms patenting by industry (and country)
* -----------------------------------------------------------------------------
// Total number of firms patenting in the world 1992-2000 by industry
use $tmp/finsample, clear
gen nace = int(nace2_1) // 2-digit nace
keep if inrange(y, 1992, 2000) // sample period
// Granted patents per firm-industry, then count firms with at least one
collapse (sum) granted, by(hrm nace)
collapse (count) firms = hrm if granted > 0, by(nace)
tabstat firms, by(nace) nototal
// Check total number of firms is same as in final sample
egen totalf = total(firms)
summarize totalf

// Total number of firms patenting in the world 1992-2000 by industry and country
use $tmp/finsample, clear
// Keep only the 10 major economies in 2000 (nominal gdp); the filter depends
// on headq alone, so applying it before the collapses yields the same rows
keep if inlist(headq, "US", "JP", "DE", "GB", "FR") ///
	| inlist(headq, "IT", "CA", "MX", "BR", "ES")
gen nace = int(nace2_1) // 2-digit nace
keep if inrange(y, 1992, 2000) // sample period
collapse (sum) granted, by(hrm headq nace)
collapse (count) firms = hrm if granted > 0, by(headq nace)
// One row per industry, one firms column per country
reshape wide firms, i(nace) j(headq) string
order nace firmsUS firmsJP firmsDE firmsGB firmsFR firmsIT firmsCA firmsMX ///
	firmsBR firmsES
tabstat firms??, by(nace) nototal

log close


* -----------------------------------------------------------------------------
* Appendix: Distribution of firms and patents across countries and industries
* Initial vs final sample
* -----------------------------------------------------------------------------
cd ../data/
log using ../logs/Appendix_firms_patents_distr, replace
di "******* Share of firms and patents across countries and industries *******"

* -----------------------------------------------------------------------------
* Appendix Figure 11
* Distribution of granted patents across countries - Initial vs final sample
* -----------------------------------------------------------------------------
// Countries share of granted patents (top 10) - Final sample
/* 	Note: if a patent is owned by firms in multiple countries, we assign the
	patent to each firm. Table 13 counts unique patents, so total is lower. */
use $tmp/finsample, clear
// 10 major economies in 2000 (nominal gdp)
gen top10 = (inlist(headq, "US", "JP", "DE", "GB", "FR") ///
	| inlist(headq, "IT", "CA", "MX", "BR", "ES"))
// One row per patent family and country, then granted patents per country
collapse (max) granted (min) y, by(docdb_family_id headq top10)
keep if inrange(y, 1992, 2000) // sample period
collapse (sum) granted, by(headq top10)
egen tot_granted = total(granted)
gen sh_granted = granted / tot_granted
// Combined share of the top-10 countries (shown for reference, then dropped)
egen top10_sh_granted = total(sh_granted) if top10 == 1
tabstat granted tot_granted sh_granted top10_sh_granted if top10 == 1, ///
	by(headq) nototal
drop top10_sh_granted
save $tmp/sh_granted_final, replace

// Countries share of granted patents (top 10) - Initial sample
use $tmp/initsample, clear
// 10 major economies in 2000 (nominal gdp)
gen top10 = (inlist(headq, "US", "JP", "DE", "GB", "FR") ///
	| inlist(headq, "IT", "CA", "MX", "BR", "ES"))
collapse (max) granted (min) y, by(docdb_family_id headq top10)
keep if inrange(y, 1992, 2000) // sample period
collapse (sum) granted, by(headq top10)
egen tot_granted = total(granted)
gen sh_granted = granted / tot_granted
egen top10_sh_granted = total(sh_granted) if top10 == 1
tabstat granted tot_granted sh_granted top10_sh_granted if top10 == 1, ///
	by(headq) nototal
drop top10_sh_granted

// Plot
// Suffix the initial-sample columns so the final-sample ones can be merged in
rename (granted sh_granted tot_granted) ///
	(granted_init sh_granted_init tot_granted_init)
merge 1:1 headq using $tmp/sh_granted_final
twoway (scatter sh_granted sh_granted_init) (lfit sh_granted sh_granted_init), ///
	scheme(lean1) legend(off) ytitle("Country share of patents - final sample", size(12pt)) ///
	xtitle("Country share of patents - initial sample", size(12pt)) ///
	xlabel(,labsize(12pt)) ylabel(,labsize(12pt))
graph export ../graph/ctry_patent_share.eps, replace

reg sh_granted sh_granted_init


* -----------------------------------------------------------------------------
* Appendix Figure 13
* Distribution of granted patents across industries - Initial vs final sample
* -----------------------------------------------------------------------------
// Industry share of granted patents - Final sample
/*	Note: a patent is only counted once in each nace (2 digit) industry.
	A patent owned by firms in different industries is counted once in each
	industry */
use $tmp/finsample, clear
gen nace = int(nace2_1) // 2-digit nace
// One row per patent family and industry, then granted patents per industry
collapse (max) granted (min) y, by(docdb_family_id nace)
keep if inrange(y, 1992, 2000) // sample period
collapse (sum) granted, by(nace)
egen tot_granted = total(granted)
gen sh_granted = granted / tot_granted
tabstat granted tot_granted sh_granted, by(nace) nototal
save $tmp/sh_granted_final, replace

// Industry share of granted patents - Initial sample
use $tmp/initsample, clear
gen nace = int(nace2_1) // 2-digit nace
collapse (max) granted (min) y, by(docdb_family_id nace)
keep if inrange(y, 1992, 2000) // sample period
collapse (sum) granted, by(nace)
egen tot_granted = total(granted)
gen sh_granted = granted / tot_granted
tabstat granted tot_granted sh_granted, by(nace) nototal

// Plot
// Suffix the initial-sample columns so the final-sample ones can be merged in
rename (granted sh_granted tot_granted) ///
	(granted_init sh_granted_init tot_granted_init)
merge 1:1 nace using $tmp/sh_granted_final
twoway (scatter sh_granted sh_granted_init) (lfit sh_granted sh_granted_init), ///
	scheme(lean1) legend(off) ytitle("Industry share of patents - final sample", size(12pt)) ///
	xtitle("Industry share of patents - initial sample", size(12pt)) ///
	xlabel(,labsize(12pt)) ylabel(,labsize(12pt))
graph export ../graph/ind_patent_share.eps, replace

reg sh_granted sh_granted_init


* -----------------------------------------------------------------------------
* Appendix Figure 12
* Distribution of firms across countries - Initial vs final sample
* -----------------------------------------------------------------------------
// Country share of firms - Final sample
use $tmp/finsample, clear
// 10 major economies in 2000 (nominal gdp)
gen top10 = (inlist(headq, "US", "JP", "DE", "GB", "FR") ///
	| inlist(headq, "IT", "CA", "MX", "BR", "ES"))
// One row per firm-country pair, then firms per country
contract hrm_l2_id headq top10
collapse (count) firms = hrm_l2_id, by(headq top10)
egen tot_firms = total(firms)
gen sh_firms = firms / tot_firms
egen top10_sh_firms = total(sh_firms) if top10 == 1
tabstat firms tot_firms sh_firms top10_sh_firms if top10 == 1, by(headq) nototal
save $tmp/sh_firms_final, replace

// Country share of firms - Initial sample
use $tmp/initsample, clear
// 10 major economies in 2000 (nominal gdp)
gen top10 = (inlist(headq, "US", "JP", "DE", "GB", "FR") ///
	| inlist(headq, "IT", "CA", "MX", "BR", "ES"))
contract hrm_l2_id headq top10
collapse (count) firms = hrm_l2_id, by(headq top10)
egen tot_firms = total(firms)
gen sh_firms = firms / tot_firms
egen top10_sh_firms = total(sh_firms) if top10 == 1
tabstat firms tot_firms sh_firms top10_sh_firms if top10 == 1, by(headq) nototal

// Plot
// Suffix the initial-sample columns so the final-sample ones can be merged in
rename (firms tot_firms sh_firms) (firms_init tot_firms_init sh_firms_init)
merge 1:1 headq using $tmp/sh_firms_final
twoway (scatter sh_firms sh_firms_init) (lfit sh_firms sh_firms_init), ///
	scheme(lean1) legend(off) ytitle("Country share of firms - final sample", size(12pt)) ///
	xtitle("Country share of firms - initial sample", size(12pt)) ///
	xlabel(,labsize(12pt)) ylabel(,labsize(12pt))
graph export ../graph/ctry_firms_share.eps, replace

reg sh_firms sh_firms_init


* -----------------------------------------------------------------------------
* Appendix Figure 14
* Distribution of firms across industries - Initial vs final sample
* -----------------------------------------------------------------------------
// Industry share of firms - Final sample
// (this part reads $tmp/finsample and writes $tmp/sh_firms_final)
use $tmp/finsample, clear
gen nace = int(nace2_1) // 2-digit nace
// One row per firm-industry pair, then firms per industry
contract hrm_l2_id nace
collapse (count) firms = hrm_l2_id, by(nace)
egen tot_firms = total(firms)
gen sh_firms = firms / tot_firms
tabstat firms tot_firms sh_firms, by(nace) nototal
save $tmp/sh_firms_final, replace

// Industry share of firms - Initial sample
// (this part reads $tmp/initsample)
use $tmp/initsample, clear
gen nace = int(nace2_1) // 2-digit nace
contract hrm_l2_id nace
collapse (count) firms = hrm_l2_id, by(nace)
egen tot_firms = total(firms)
gen sh_firms = firms / tot_firms
tabstat firms tot_firms sh_firms, by(nace) nototal

// Plot
// Suffix the initial-sample columns so the final-sample ones can be merged in
rename (firms tot_firms sh_firms) (firms_init tot_firms_init sh_firms_init)
merge 1:1 nace using $tmp/sh_firms_final
twoway (scatter sh_firms sh_firms_init) (lfit sh_firms sh_firms_init), ///
	scheme(lean1) legend(off) ytitle("Industry share of firms - final sample", size(12pt)) ///
	xtitle("Industry share of firms - initial sample", size(12pt)) ///
	xlabel(,labsize(12pt)) ylabel(,labsize(12pt))
graph export ../graph/ind_firms_share.eps, replace

reg sh_firms sh_firms_init

log close
